University of Sri Jayewardenepura, Sri Lanka
June 4, 2025
(Based on 2019 data)
Favorable climate that speed up the mosquito life cycle
Warm temperatures year-round
High humidity and frequent rainfall
No harsh winters
Many equatorial countries have rapidly growing urban areas with poor waste and water management.
People often live and work in open environments with limited protection, increasing exposure.
Sri Lanka
library(dplyr)
library(ggplot2)
library(lubridate)
library(tsibble)
library(tidyverse)
library(lubridate)
library(plotly)
# Step 1: Aggregate weekly cases across all districts
data("srilanka_weekly_data")
srilanka_weekly_data <- srilanka_weekly_data[-which(srilanka_weekly_data$year == 2023 & srilanka_weekly_data$week == 52), ]
srilanka_weekly_data[which(srilanka_weekly_data$start.date == "12/26/2020"), ]$cases <- c(35, 17, 18, 20, 2, 0, 3, 2, 7, 6, 1, 1, 0, 0, 208, 0, 0, 12, 8, 4, 0, 0, 0, 2, 3, 2)
country_weekly <- srilanka_weekly_data |>
group_by(year, week, start.date) %>%
summarise(total_cases = sum(cases, na.rm = TRUE), .groups = 'drop') |>
arrange(start.date)
# Step 2: Plot the time series
country_weekly <- country_weekly |>
mutate(
yearweek = yearweek(start.date)) |>
distinct(yearweek, .keep_all = TRUE)
country_weekly_tsibble <- country_weekly |>
as_tsibble(index = yearweek)
p1 <- ggplot(country_weekly_tsibble, aes(x = yearweek, y = total_cases)) +
geom_line() +
scale_x_yearweek(date_breaks = "1 year", date_labels = "%Y") +
labs(
# title = "Weekly Dengue Cases in Sri Lanka",
# subtitle = "From National Aggregated Data",
x = "Year",
y = "Weekly Dengue Cases"
) +
theme_minimal() +
theme(
axis.text.x = element_text(angle = 45, hjust = 1),
plot.title = element_text(face = "bold")
)
ggplotly(p1)Red Segment: Interrupted Period Due to the COVID-19 Pandemic
denguedatahub R packagedenguedatahub R packagelink: https://denguedatahub.netlify.app/
Approach 1
Excluding the interrupted period from model training
Approach 2
Forecasting the interrupted period first and then using it for modeling
Approach 3
Down-weighting interrupted period
Benchmark
#data("srilanka_weekly_data")
df <- srilanka_weekly_data |>
mutate(yearweek = yearweek(start.date))
duplicaterws <- df |>
mutate(
yearweek = yearweek(start.date)) |>
duplicates(key = district, index = yearweek)
df_tsibble <- df |>
distinct(district, yearweek, .keep_all = TRUE) |>
as_tsibble(index = yearweek, key = district)df_tsibble |> head()
# A tsibble: 6 x 7 [1W]
# Key: district [1]
year week start.date end.date district cases yearweek
<dbl> <dbl> <chr> <chr> <chr> <dbl> <week>
1 2006 52 12/23/2006 12/29/2006 Ampara 0 2006 W51
2 2007 1 12/30/2006 1/5/2007 Ampara 0 2006 W52
3 2007 2 1/6/2007 1/12/2007 Ampara 0 2007 W01
4 2007 3 1/13/2007 1/19/2007 Ampara 0 2007 W02
5 2007 4 1/20/2007 1/26/2007 Ampara 0 2007 W03
6 2007 5 1/27/2007 2/2/2007 Ampara 0 2007 W04df_tsibble |> tail()
# A tsibble: 6 x 7 [1W]
# Key: district [1]
year week start.date end.date district cases yearweek
<dbl> <dbl> <chr> <chr> <chr> <dbl> <week>
1 2025 6 1/25/2025 1/31/2025 Vavuniya 3 2025 W04
2 2025 7 2/1/2025 2/7/2025 Vavuniya 2 2025 W05
3 2025 8 2/8/2025 2/14/2025 Vavuniya 2 2025 W06
4 2025 9 2/15/2025 2/21/2025 Vavuniya 0 2025 W07
5 2025 10 2/22/2025 2/28/2025 Vavuniya 5 2025 W08
6 2025 11 3/1/2025 3/7/2025 Vavuniya 0 2025 W09Training set
train_tsibble <- df_tsibble |>
filter(year(yearweek) < 2025)
train_tsibble
# A tsibble: 24,466 x 7 [1W]
# Key: district [26]
year week start.date end.date district cases yearweek
<dbl> <dbl> <chr> <chr> <chr> <dbl> <week>
1 2006 52 12/23/2006 12/29/2006 Ampara 0 2006 W51
2 2007 1 12/30/2006 1/5/2007 Ampara 0 2006 W52
3 2007 2 1/6/2007 1/12/2007 Ampara 0 2007 W01
4 2007 3 1/13/2007 1/19/2007 Ampara 0 2007 W02
5 2007 4 1/20/2007 1/26/2007 Ampara 0 2007 W03
6 2007 5 1/27/2007 2/2/2007 Ampara 0 2007 W04
7 2007 6 2/3/2007 2/9/2007 Ampara 0 2007 W05
8 2007 7 2/10/2007 2/16/2007 Ampara 0 2007 W06
9 2007 8 2/17/2007 2/23/2007 Ampara 0 2007 W07
10 2007 9 2/24/2007 3/2/2007 Ampara 1 2007 W08
# ℹ 24,456 more rowsTest set
test_tsibble <- df_tsibble |>
filter(year(yearweek) == 2025)
test_tsibble
# A tsibble: 234 x 7 [1W]
# Key: district [26]
year week start.date end.date district cases yearweek
<dbl> <dbl> <chr> <chr> <chr> <dbl> <week>
1 2025 3 1/4/2025 1/10/2025 Ampara 6 2025 W01
2 2025 4 1/11/2025 1/17/2025 Ampara 7 2025 W02
3 2025 5 1/18/2025 1/24/2025 Ampara 3 2025 W03
4 2025 6 1/25/2025 1/31/2025 Ampara 4 2025 W04
5 2025 7 2/1/2025 2/7/2025 Ampara 3 2025 W05
6 2025 8 2/8/2025 2/14/2025 Ampara 3 2025 W06
7 2025 9 2/15/2025 2/21/2025 Ampara 5 2025 W07
8 2025 10 2/22/2025 2/28/2025 Ampara 4 2025 W08
9 2025 11 3/1/2025 3/7/2025 Ampara 0 2025 W09
10 2025 3 1/4/2025 1/10/2025 Anuradhapura 20 2025 W01
# ℹ 224 more rows# A mable: 26 x 2
# Key: district [26]
district arima
<chr> <model>
1 Ampara <ARIMA(0,1,2)>
2 Anuradhapura <ARIMA(0,1,4)>
3 Badulla <ARIMA(2,1,3)(0,0,1)[52]>
4 Batticaloa <ARIMA(2,1,3)(0,0,1)[52]>
5 Colombo <ARIMA(0,1,2)(0,0,1)[52]>
6 Galle <ARIMA(0,1,3)(0,0,1)[52]>
7 Gampaha <ARIMA(0,1,2)>
8 Hambanthota <ARIMA(0,1,1)(0,0,1)[52]>
9 Jaffna <ARIMA(0,1,5)>
10 Kalmune <ARIMA(0,1,0)>
# ℹ 16 more rows
train_tsibble_excludecovid |> head()
# A tsibble: 6 x 7 [1W]
# Key: district [1]
year week start.date end.date district cases yearweek
<dbl> <dbl> <chr> <chr> <chr> <dbl> <week>
1 2023 2 1/7/2023 1/13/2023 Ampara 7 2023 W01
2 2023 3 1/14/2023 1/20/2023 Ampara 8 2023 W02
3 2023 4 1/21/2023 1/27/2023 Ampara 1 2023 W03
4 2023 5 1/28/2023 2/3/2023 Ampara 1 2023 W04
5 2023 6 2/4/2023 2/10/2023 Ampara 7 2023 W05
6 2023 7 2/11/2023 2/17/2023 Ampara 1 2023 W06
train_tsibble_excludecovid |> tail()
# A tsibble: 6 x 7 [1W]
# Key: district [1]
year week start.date end.date district cases yearweek
<dbl> <dbl> <chr> <chr> <chr> <dbl> <week>
1 2024 49 11/23/2024 11/29/2024 Vavuniya 3 2024 W47
2 2024 50 11/30/2024 12/6/2024 Vavuniya 2 2024 W48
3 2024 51 12/7/2024 12/13/2024 Vavuniya 4 2024 W49
4 2024 52 12/14/2024 12/20/2024 Vavuniya 4 2024 W50
5 2025 1 12/21/2024 12/27/2024 Vavuniya 2 2024 W51
6 2025 2 12/28/2024 1/3/2025 Vavuniya 4 2024 W52# A mable: 26 x 2
# Key: district [26]
district arima
<chr> <model>
1 Ampara <ARIMA(1,0,3) w/ mean>
2 Anuradhapura <ARIMA(0,0,3) w/ mean>
3 Badulla <ARIMA(1,0,1) w/ mean>
4 Batticaloa <ARIMA(0,1,0)>
5 Colombo <ARIMA(3,0,1) w/ mean>
6 Galle <ARIMA(0,1,1)>
7 Gampaha <ARIMA(0,1,1)>
8 Hambanthota <ARIMA(0,1,1)>
9 Jaffna <ARIMA(1,0,2)>
10 Kalmune <ARIMA(3,1,0)>
# ℹ 16 more rows
Step 1: Forecasting interrupted period
Step 2: Replace interrupted period with forecasts
train_tsibble_updated
# A tsibble: 24,466 x 8 [1W]
# Key: district [26]
year week start.date end.date district yearweek .model cases
<dbl> <dbl> <chr> <chr> <chr> <week> <chr> <dbl>
1 2006 52 12/23/2006 12/29/2006 Ampara 2006 W51 <NA> 0
2 2007 1 12/30/2006 1/5/2007 Ampara 2006 W52 <NA> 0
3 2007 2 1/6/2007 1/12/2007 Ampara 2007 W01 <NA> 0
4 2007 3 1/13/2007 1/19/2007 Ampara 2007 W02 <NA> 0
5 2007 4 1/20/2007 1/26/2007 Ampara 2007 W03 <NA> 0
6 2007 5 1/27/2007 2/2/2007 Ampara 2007 W04 <NA> 0
7 2007 6 2/3/2007 2/9/2007 Ampara 2007 W05 <NA> 0
8 2007 7 2/10/2007 2/16/2007 Ampara 2007 W06 <NA> 0
9 2007 8 2/17/2007 2/23/2007 Ampara 2007 W07 <NA> 0
10 2007 9 2/24/2007 3/2/2007 Ampara 2007 W08 <NA> 1
# ℹ 24,456 more rowsStep 3: Use updated training set for forecasting
tb_all_ARIMA_updatedcovid
# A mable: 26 x 2
# Key: district [26]
district arima
<chr> <model>
1 Ampara <ARIMA(0,1,2)(0,0,2)[52]>
2 Anuradhapura <ARIMA(3,1,3)>
3 Badulla <ARIMA(1,1,4)>
4 Batticaloa <ARIMA(0,1,1)(0,0,1)[52]>
5 Colombo <ARIMA(0,1,3)(0,0,1)[52]>
6 Galle <ARIMA(0,1,3)(0,0,1)[52]>
7 Gampaha <ARIMA(0,1,2)>
8 Hambanthota <ARIMA(0,1,1)(0,0,1)[52]>
9 Jaffna <ARIMA(1,1,2)>
10 Kalmune <ARIMA(0,1,0)>
# ℹ 16 more rowsBenchmark: Without considering the interruption effect
Approach 1: Excluding the interrupted period from model training
Approach 2: Forecasting the interrupted period first and then using it for modeling
\[forecast_{Approach 3} = 0.7 \times forecast_{\text{excluded}} + 0.3 \times forecast_{\text{included without extimating}}\]
Output only for fcb_accuracy for illustration
fcb_accuracy
# A tibble: 26 × 11
.model district .type ME RMSE MAE MPE MAPE MASE RMSSE
<chr> <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 arima Ampara Test 0.419 1.95 1.50 -Inf Inf NaN NaN
2 arima Anuradhapura Test 4.98 9.37 6.78 12.0 22.8 NaN NaN
3 arima Badulla Test 12.9 15.2 12.9 64.5 66.1 NaN NaN
4 arima Batticaloa Test -0.00937 7.56 5.14 -1.35 7.95 NaN NaN
5 arima Colombo Test 38.4 43.3 38.4 15.2 15.2 NaN NaN
6 arima Galle Test 21.1 25.8 22.1 38.6 43.3 NaN NaN
7 arima Gampaha Test -62.5 68.1 62.5 -45.4 45.4 NaN NaN
8 arima Hambanthota Test -7.47 10.8 9.77 -59.8 66.7 NaN NaN
9 arima Jaffna Test -20.4 22.6 20.4 -76.0 76.0 NaN NaN
10 arima Kalmune Test -41 41.2 41 -355. 355. NaN NaN
# ℹ 16 more rows
# ℹ 1 more variable: ACF1 <dbl>Benchmark <- fcb_accuracy$RMSE
Excluded <- fc_a1_accuracy$RMSE
UpdatedCovid <- fc_a2_accuracy$RMSE
rmse_by_district <- fc_a3 |>
group_by(district) |>
summarise(RMSE = sqrt(mean((actual - predicted)^2)), .groups = "drop")
Downweight <- rmse_by_district$RMSE
rmse_tbl <- tibble(Benchmark,Excluded, UpdatedCovid, Downweight)
rmse_tbl
# A tibble: 26 × 4
Benchmark Excluded UpdatedCovid Downweight
<dbl> <dbl> <dbl> <dbl>
1 1.95 2.13 2.12 2.12
2 9.37 12.4 8.73 11.6
3 15.2 9.97 11.6 9.68
4 7.56 7.87 8.57 7.72
5 43.3 42.4 43.9 26.4
6 25.8 23.8 24.6 24.2
7 68.1 32.6 70.0 42.2
8 10.8 11.1 10.8 11.2
9 22.6 8.64 22.3 8.56
10 41.2 29.6 41.2 33.1
# ℹ 16 more rows| Approach | mean_RMSE | median_RMSE | sd_RMSE | min_RMSE | max_RMSE |
|---|---|---|---|---|---|
| Benchmark | 15.4 | 10.3 | 15.6 | 1.95 | 68.1 |
| Downweight | 12.4 | 10.1 | 10.2 | 1.66 | 42.2 |
| Excluded | 12.6 | 10.0 | 10.4 | 1.56 | 42.4 |
| UpdatedCovid | 15.6 | 10.4 | 15.8 | 2.12 | 70.0 |
| Approach | mean_MAE | median_MAE | sd_MAE | min_MAE | max_MAE |
|---|---|---|---|---|---|
| Benchmark | 13.7 | 8.14 | 14.65 | 1.50 | 62.5 |
| Downweight | 10.6 | 8.62 | 9.31 | 1.46 | 36.7 |
| Excluded | 10.8 | 8.72 | 9.33 | 1.15 | 35.5 |
| UpdatedCovid | 13.9 | 8.35 | 14.87 | 1.74 | 64.5 |
| Approach | mean_MAE | median_MAE | sd_MAE | min_MAE | max_MAE |
|---|---|---|---|---|---|
| Benchmark | 13.7 | 8.14 | 14.65 | 1.50 | 62.5 |
| Downweight | 10.6 | 8.62 | 9.31 | 1.46 | 36.7 |
| Excluded | 10.8 | 8.72 | 9.33 | 1.15 | 35.5 |
| UpdatedCovid | 13.9 | 8.35 | 14.87 | 1.74 | 64.5 |
# A tibble: 26 × 6
Benchmark Excluded UpdatedCovid Downweight Best District
<dbl> <dbl> <dbl> <dbl> <chr> <chr>
1 1.95 2.13 2.12 2.12 Benchmark Ampara
2 9.37 12.4 8.73 11.6 UpdatedCovid Anuradhapura
3 15.2 9.97 11.6 9.68 Downweight Badulla
4 7.56 7.87 8.57 7.72 Benchmark Batticaloa
5 43.3 42.4 43.9 26.4 Downweight Colombo
6 25.8 23.8 24.6 24.2 Excluded Galle
7 68.1 32.6 70.0 42.2 Excluded Gampaha
8 10.8 11.1 10.8 11.2 Benchmark Hambanthota
9 22.6 8.64 22.3 8.56 Downweight Jaffna
10 41.2 29.6 41.2 33.1 Excluded Kalmune
# ℹ 16 more rows
library(sf)
rmse_tbl <- rmse_tbl |>
mutate(join_key = substr(District, 1, 5))
lka_adm2 <- lka_adm2 |>
mutate(join_key = substr(NAME, 1, 5))
# left join to preserve sf geometry from lka_adm2
combined_sf <- lka_adm2 %>%
left_join(rmse_tbl, by = "join_key")
p1 <- ggplot(combined_sf ) +
geom_sf(aes(fill=Best)) + scale_fill_brewer(palette = "Dark2")
p1